import csv
import pandas as pd
import re

filename = r"C:/Users/blwang14/Desktop/未知用户.txt"  # Path to the input file (read as tab-delimited CSV)






# A name that contains a 4-digit run followed, anywhere later, by a month
# (01-12) and then a day (01-31), e.g. "20230115" or "dt=2023-01-15".
_DATE_NAME_RE = re.compile(
    r"\w*?=?(\d{4}).*?(0[1-9]|1[0-2]).*?(0[1-9]|[12]\d|3[01])(?!.*/).*$"
)
# A name made up exclusively of digits.
_DIGITS_ONLY_RE = re.compile(r"^\d+$")


def processdata(dataRaw):
    """Return True when a row should be kept, False otherwise.

    Only the first field of the row is inspected; it is treated as a
    "/"-separated path. A row is kept when all of the following hold:

    * the path does not contain "/.hive-staging_hive_";
    * the last path component does not look like a date
      (see ``_DATE_NAME_RE``);
    * the last path component is not purely numeric.

    Args:
        dataRaw: One parsed row, i.e. a sequence of strings.

    Returns:
        bool: True to keep the row, False to drop it. Empty rows (which
        ``csv.reader`` yields for blank lines) are dropped instead of
        raising ``IndexError``.
    """
    if not dataRaw:
        return False

    data = dataRaw[0]
    last_dir = data.split("/")[-1]

    date_match = _DATE_NAME_RE.search(last_dir)
    if date_match is not None:
        # Kept from the original implementation: echo every date-like name
        # that causes a row to be dropped.
        print(date_match.group())
        return False

    if "/.hive-staging_hive_" in data:
        return False

    return _DIGITS_ONLY_RE.search(last_dir) is None



# Read the tab-delimited input file and keep only the rows accepted by
# processdata. newline='' is what the csv module documentation asks for when a
# file object is handed to csv.reader, so the reader itself interprets line
# endings and newlines embedded in quoted fields.
with open(filename, 'r', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    # filter() is lazy: materialize the surviving rows while the file is open.
    rows = list(filter(processdata, reader))

# Column names assigned to the two fields of each kept row.
columns = ['full_name', 'user_name']

# Build the DataFrame from the kept rows and show it.
df = pd.DataFrame(rows, columns=columns)
print(df)

# Remove duplicate rows, order by full_name, and write the result as a
# tab-separated file without the index column.
df_sorted = df.drop_duplicates().sort_values("full_name")
df_sorted.to_csv('data.csv', sep="\t", index=False)
    